# This file mainly presents how to get Pathway 3's related analysis results

# Results can be obtained in this file include:

    ## Appendix 4.2: Logistic Regression Results (Pathway 3)
    ## Table 6: Predictive Accuracy Results  (Pathway 3)
    ## Table 8: Variance Inflation Factor Values of Variables (Pathway 3 part)
    ## Table 9: Models’ performance metrics (Pathway 3 part)

## R version: 4.4.2

#-----------------Prepare the libraries and the dataset------------------------#

rm(list = ls())  

# Libraries  
library(readxl)  
library(tidyverse)  
library(fixest)        
library(lme4)          
library(pROC)          
library(ResourceSelection) 
library(car)
library(knitr)

options(scipen = 100)  

# Input the dataset 

PSIdata <- read_excel("~/Desktop/PSI.xlsx")
PSIdata <- PSIdata %>%  
  mutate(  
    Country = as.factor(Country),  
    Year = as.factor(as.character(Year))
  )  


#--------------Appendix 4.2: Logistic Regression Results (Pathway 3)-----------#

# 1) Create train/test splits -------------------------------------------------  

# text_type==0 means UNPSA text; text_type==1 means synthetic (AI-generated) text

train_data8 <- PSIdata %>% filter(text_type == 1, Year %in% c("2018", "2019"))  
test_data8  <- PSIdata %>% filter(text_type == 0)  

train_data9 <- PSIdata %>% filter(text_type == 1, Year %in% c("2018", "2019", "2020"))  
test_data9  <- PSIdata %>% filter(text_type == 0)  

train_data10 <- PSIdata %>% filter(text_type == 1, Year %in% c("2018", "2019", "2020", "2021"))  
test_data10  <- PSIdata %>% filter(text_type == 0) 

train_data11 <- PSIdata %>% filter(text_type == 1, Year %in% c("2018", "2019", "2020", "2021", "2022"))  
test_data11  <- PSIdata %>% filter(text_type == 0) 

# predictor vector  

predictors <- c("innovation", "impact", "replicability",  
                "subjectivity", "numeric", "emotionality", "certainty","wordcount")  
pred_formula_rhs <- paste(predictors, collapse = " + ")  


# 3) Regression models------------------------------------------------

fit_glmer <- function(train_df, model_name) {  
  re_terms <- c()  
  if (n_distinct(train_df$Country) > 1) re_terms <- c(re_terms, "(1|Country)")  
  if (n_distinct(train_df$Year) > 1)    re_terms <- c(re_terms, "(1|Year)")  
  re_part <- if (length(re_terms) > 0) paste("+", paste(re_terms, collapse = " + ")) else ""  
  fmla <- as.formula(paste0("win ~ ", pred_formula_rhs, " ", re_part))  
  message("Fitting GLMM for ", model_name, ": ", deparse(fmla))  
  glmer(fmla, data = train_df, family = binomial(link = "logit"),  
        control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5)))  
}  

# Fit all four GLMM models
model8 <- fit_glmer(train_data8, "Model 8")  
model9 <- fit_glmer(train_data9, "Model 9")  
model10 <- fit_glmer(train_data10, "Model 10")  
model11 <- fit_glmer(train_data11, "Model 11")  

# Extract coefficients from GLMM models
extract_glmm_coefficients <- function(model, model_name) {
  coef_summary <- summary(model)$coefficients
  coef_df <- as.data.frame(coef_summary)
  coef_df$Variable <- rownames(coef_df)
  coef_df$Model <- model_name
  coef_df <- coef_df[, c("Model", "Variable", "Estimate", "z value", "Pr(>|z|)")]
  names(coef_df) <- c("Model", "Variable", "Coefficient", "z_value", "p_value")
  
  coef_df$Significance <- ifelse(coef_df$p_value < 0.001, "***",
                                 ifelse(coef_df$p_value < 0.01, "**",
                                        ifelse(coef_df$p_value < 0.05, "*", "")))
  return(coef_df)
}

# 4) Output results-------------------------------------------------------------

# Model 8 regression results---------------------------------------------------- 
coef_model8 <- extract_glmm_coefficients(model8, "Model 8 (Train: 2018-2019)")
kable(coef_model8[, c("Variable", "Coefficient", "z_value", "p_value", "Significance")],
      digits = 3,
      row.names = FALSE,
      caption = "MODEL 8 (UNPSA Winner of 2018-2022)")
cat("N (Train) =", nrow(train_data8), "observations\n")
cat("N (Test) =", nrow(test_data8), "observations\n\n")


# Model 9 regression results---------------------------------------------------- 
coef_model9 <- extract_glmm_coefficients(model9, "Model 9 (Train: 2018-2020)")
kable(coef_model9[, c("Variable", "Coefficient", "z_value", "p_value", "Significance")],
      digits = 3,
      row.names = FALSE,
      caption = "MODEL 9 (UNPSA Winner of 2018-2022)")
cat("N (Train) =", nrow(train_data9), "observations\n")
cat("N (Test) =", nrow(test_data9), "observations\n\n")


# Model 10 regression results---------------------------------------------------- 
coef_model10 <- extract_glmm_coefficients(model10, "Model 10 (Train: 2018-2021)")
kable(coef_model10[, c("Variable", "Coefficient", "z_value", "p_value", "Significance")],
      digits = 3,
      row.names = FALSE,
      caption = "MODEL 10 (UNPSA Winner of 2018-2022)")
cat("N (Train) =", nrow(train_data10), "observations\n")
cat("N (Test) =", nrow(test_data10), "observations\n\n")


# Model 11 regression results---------------------------------------------------- 
coef_model11 <- extract_glmm_coefficients(model11, "Model 11 (Train: 2018-2021)")
kable(coef_model11[, c("Variable", "Coefficient", "z_value", "p_value", "Significance")],
      digits = 3,
      row.names = FALSE,
      caption = "MODEL 11 (UNPSA Winner of 2018-2022)")
cat("N (Train) =", nrow(train_data11), "observations\n")
cat("N (Test) =", nrow(test_data11), "observations\n\n")


#-------------Table 6: Predictive Accuracy Results  (Pathway 3)---------------#

# 1) calculate the model performance metrics------------------------------------

calculate_pr_auc <- function(actual, probs) {
  if (length(actual) != length(probs)) {
    stop("Not match")
  }
  
  # remove NA value
  valid_idx <- !is.na(actual) & !is.na(probs)
  actual <- actual[valid_idx]
  probs <- probs[valid_idx]
  
  if (sum(actual) == 0) {
    return(0)
  }
  
  # calculate Precision-Recall curve
  pr_curve <- data.frame(
    threshold = seq(0, 1, by = 0.01),
    precision = NA,
    recall = NA
  )
  
  for (i in 1:nrow(pr_curve)) {
    threshold <- pr_curve$threshold[i]
    pred_class <- ifelse(probs >= threshold, 1, 0)
    
    # Calculate the confusion matrix
    tp <- sum(actual == 1 & pred_class == 1)
    tn <- sum(actual == 0 & pred_class == 0)
    fp <- sum(actual == 0 & pred_class == 1)
    fn <- sum(actual == 1 & pred_class == 0)
    
    # calculate Precision and Recall
    precision <- ifelse((tp + fp) > 0, tp / (tp + fp), 1)
    recall <- ifelse((tp + fn) > 0, tp / (tp + fn), 0)
    
    pr_curve$precision[i] <- precision
    pr_curve$recall[i] <- recall
  }
  
  pr_curve <- pr_curve[order(pr_curve$recall, -pr_curve$precision), ]
  pr_curve <- pr_curve[!duplicated(pr_curve$recall), ]
  
  # calculate the PR AUC 
  pr_auc <- 0
  for (i in 2:nrow(pr_curve)) {
    width <- pr_curve$recall[i] - pr_curve$recall[i-1]
    height <- (pr_curve$precision[i] + pr_curve$precision[i-1]) / 2
    pr_auc <- pr_auc + width * height
  }
  
  return(pr_auc)
}

calculate_metrics <- function(actual, predicted, probs) {
  conf_matrix <- table(actual, predicted)
  accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
  recall <- conf_matrix[2,2] / sum(conf_matrix[2,])
  specificity <- conf_matrix[1,1] / sum(conf_matrix[1,])
  precision <- conf_matrix[2,2] / sum(conf_matrix[,2])
  f1_score <- 2 * (precision * recall) / (precision + recall)
  
  # ROC AUC
  roc_obj <- roc(actual, probs)
  auc_val <- auc(roc_obj)
  
  # PR AUC
  pr_auc_val <- calculate_pr_auc(actual, probs)
  
  return(list(
    confusion_matrix = conf_matrix,
    accuracy = accuracy,
    recall = recall,
    specificity = specificity,
    precision = precision,
    f1_score = f1_score,
    auc = auc_val,
    pr_auc = pr_auc_val
  ))
}

pred_prob8 <- predict(model8, newdata = test_data8, type = "response", allow.new.levels = TRUE)  
pred_prob9 <- predict(model9, newdata = test_data9, type = "response", allow.new.levels = TRUE)  
pred_prob10 <- predict(model10, newdata = test_data10, type = "response", allow.new.levels = TRUE)  
pred_prob11 <- predict(model11, newdata = test_data11, type = "response", allow.new.levels = TRUE)  

pred_class8 <- ifelse(pred_prob8 >= 0.5, 1, 0)  
pred_class9 <- ifelse(pred_prob9 >= 0.5, 1, 0)  
pred_class10 <- ifelse(pred_prob10 >= 0.5, 1, 0)  
pred_class11 <- ifelse(pred_prob11 >= 0.5, 1, 0)  

# 2) output the predictive accuracy results (Table 6)---------------------------

metrics8 <- calculate_metrics(test_data8$win, pred_class8, pred_prob8)
metrics9 <- calculate_metrics(test_data9$win, pred_class9, pred_prob9)
metrics10 <- calculate_metrics(test_data10$win, pred_class10, pred_prob10)
metrics11 <- calculate_metrics(test_data11$win, pred_class11, pred_prob11)

# Predictive Accuracy Results of Model 8----------------------------------------
print(metrics8) 

# Predictive Accuracy Results of Model 9----------------------------------------
print(metrics9) 

# Predictive Accuracy Results of Model 10--------------------------------------
print(metrics10) 

# Predictive Accuracy Results of Model 11---------------------------------------
print(metrics11) 

#---------Table 8: Variance Inflation Factor Values of Variables--------------#
# Models 8, 9, 10， 11

round(vif(model8), 3)
round(vif(model9), 3)
round(vif(model10), 3)
round(vif(model11), 3)

#-------------------------Table 9: Models’ performance metrics----------------#
# Models 8, 9, 10, 11 part

# 1) construct the performance table--------------------------------------------

performance_table <- data.frame(
  Model = c("Model 8", "Model 9", "Model 10","Model 11"),
  Precision = c(metrics8$precision, metrics9$precision, metrics10$precision, metrics11$precision),
  Recall = c(metrics8$recall, metrics9$recall, metrics10$recall, metrics11$recall),
  F1_Score = c(metrics8$f1_score, metrics9$f1_score, metrics10$f1_score, metrics11$f1_score),
  PR_AUC = c(metrics8$pr_auc, metrics9$pr_auc, metrics10$pr_auc, metrics11$pr_auc)
)

performance_table[, -1] <- round(performance_table[, -1], 3)

# 2) Output the performance results in Table 9---------------------------------------------

cat("\nPerformance Metrics Table:\n")
print(performance_table, row.names = FALSE)

